# Scatter plot: data sharing percentage against the share of grants that
# mandate data sharing, coloured by the pubs-vs-data incentive weight.
ggplot(
  df_clean,
  aes(x = data_grant_share, y = data_sharing_perc, colour = pubs.vs.data)
) +
  geom_point(size = 1.5) +
  scale_colour_viridis_c(option = "C")

# why does the right side not go up higher? is this the base funding that
# stays stronger for those with not many grants?
p <- df_clean %>%
  mutate(stats = paste("pubs:", n_publications, "grants:", total_grants)) %>%
  ggplot(
    aes(
      x = data_grant_share, y = data_sharing_perc,
      colour = factor(pubs.vs.data), label = stats
    )
  ) +
  geom_point(alpha = 0.5) +
  scale_colour_viridis_d(option = "C")
p

# interactive version: hovering shows the pubs/grants label
plotly::ggplotly(p)
# Same relationship, but one panel per incentive setting instead of colour.
df_clean %>%
  mutate(stats = paste("pubs:", n_publications, "grants:", total_grants)) %>%
  ggplot(aes(x = data_grant_share, y = data_sharing_perc, label = stats)) +
  geom_point(alpha = 0.1) +
  facet_wrap(vars(factor(pubs.vs.data)))

# 2d density contours per incentive setting — avoids the overplotting of the
# raw scatter above.
df_clean %>%
  mutate(stats = paste("pubs:", n_publications, "grants:", total_grants)) %>%
  ggplot(aes(x = data_grant_share, y = data_sharing_perc, label = stats)) +
  geom_density_2d() +
  facet_wrap(vars(factor(pubs.vs.data)))

Rewarding researchers based on datasets simply adds more randomness to the system.

# Publications against the share of data-mandating grants, one panel per
# incentive setting.
ggplot(df_clean, aes(x = data_grant_share, y = n_publications)) +
  geom_point() +
  facet_wrap(vars(pubs.vs.data)) +
  labs(title = "Influence of incentive settings")

Modeling

# Baseline model: publications explained additively by total grants and the
# share of grants that mandate data sharing.
m1 <- lm(n_publications ~ total_grants + data_grant_share, data = df_clean)
summary(m1)
## 
## Call:
## lm(formula = n_publications ~ total_grants + data_grant_share, 
##     data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -130.89  -13.09    0.28   13.33  158.75 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      156.164550   0.963725  162.04   <2e-16 ***
## total_grants       5.903346   0.001857 3179.15   <2e-16 ***
## data_grant_share -28.204328   1.882182  -14.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.35 on 11997 degrees of freedom
## Multiple R-squared:  0.9988, Adjusted R-squared:  0.9988 
## F-statistic: 5.056e+06 on 2 and 11997 DF,  p-value: < 2.2e-16
# have to use rescale directly, since arm::standardize does not work with the
# targets workflow
# NOTE(review): arm::rescale standardises the predictors so their coefficients
# are on a comparable scale — confirm the exact scaling against the arm docs.
m1 <- lm(n_publications ~ 
           arm::rescale(total_grants) + 
           arm::rescale(data_grant_share),
         data = df_clean)
summary(m1)
## 
## Call:
## lm(formula = n_publications ~ arm::rescale(total_grants) + arm::rescale(data_grant_share), 
##     data = df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -130.89  -13.09    0.28   13.33  158.75 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     614.4616     0.2131 2882.88   <2e-16 ***
## arm::rescale(total_grants)     1355.8297     0.4265 3179.15   <2e-16 ***
## arm::rescale(data_grant_share)   -6.3907     0.4265  -14.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.35 on 11997 degrees of freedom
## Multiple R-squared:  0.9988, Adjusted R-squared:  0.9988 
## F-statistic: 5.056e+06 on 2 and 11997 DF,  p-value: < 2.2e-16
# Quadratic extension: add a squared term for total grants to check for
# non-linearity in the grants-publications relationship.
m2 <- lm(n_publications ~ total_grants + data_grant_share + 
           I(total_grants^2),
         data = df_clean)
summary(m2)
## 
## Call:
## lm(formula = n_publications ~ total_grants + data_grant_share + 
##     I(total_grants^2), data = df_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -163.622  -13.171   -0.035   13.007  159.018 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.579e+02  9.702e-01  162.80   <2e-16 ***
## total_grants       5.853e+00  4.674e-03 1252.20   <2e-16 ***
## data_grant_share  -2.777e+01  1.872e+00  -14.84   <2e-16 ***
## I(total_grants^2)  1.044e-04  8.874e-06   11.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.22 on 11996 degrees of freedom
## Multiple R-squared:  0.9988, Adjusted R-squared:  0.9988 
## F-statistic: 3.41e+06 on 3 and 11996 DF,  p-value: < 2.2e-16
# m2 fits better, but has too high VIF
# (total_grants and its square are, unsurprisingly, strongly collinear:
# VIF > 6 for both)
car::vif(m2)
##      total_grants  data_grant_share I(total_grants^2) 
##          6.413794          1.001205          6.410746
# so we stick with model 1

# however, there might be an interaction between the share of grants and the
# total number of grants
m3 <- lm(n_publications ~ 
           arm::rescale(total_grants) * 
           arm::rescale(data_grant_share),
         data = df_clean)
summary(m3)
## 
## Call:
## lm(formula = n_publications ~ arm::rescale(total_grants) * arm::rescale(data_grant_share), 
##     data = df_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -122.390  -12.998    0.184   13.147  155.050 
## 
## Coefficients:
##                                                            Estimate Std. Error
## (Intercept)                                                614.9651     0.2115
## arm::rescale(total_grants)                                1357.9269     0.4335
## arm::rescale(data_grant_share)                             -23.8306     0.9899
## arm::rescale(total_grants):arm::rescale(data_grant_share)  -70.2500     3.6110
##                                                           t value Pr(>|t|)    
## (Intercept)                                               2908.25   <2e-16 ***
## arm::rescale(total_grants)                                3132.20   <2e-16 ***
## arm::rescale(data_grant_share)                             -24.07   <2e-16 ***
## arm::rescale(total_grants):arm::rescale(data_grant_share)  -19.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.99 on 11996 degrees of freedom
## Multiple R-squared:  0.9989, Adjusted R-squared:  0.9989 
## F-statistic: 3.477e+06 on 3 and 11996 DF,  p-value: < 2.2e-16
# Compare the interaction model against the additive baseline.
AIC(m3, m1)
##    df      AIC
## m3  5 109301.5
## m1  4 109672.3
# m3 is clearly better (lower AIC by ~370)
# Nest the data by incentive setting so that one model per scenario can be
# fitted below (keeps the grouping on the nested tibble).
df_nested <- df_clean %>% 
  group_by(pubs.vs.data) %>% 
  nest()

# Fit the per-scenario linear model: publications explained by total grants,
# the share of data-mandating grants, and their interaction.
#
# @param df A data frame with columns n_publications, total_grants and
#   data_grant_share.
# @return A fitted "lm" object.
modeling_fun <- function(df) {
  lm(
    n_publications ~ total_grants * data_grant_share,
    data = df
  )
}

# Fit the model in each scenario and store tidy/glance summaries alongside it
# as list-columns.
df_nested <- df_nested %>% 
  mutate(model = map(data, modeling_fun),
         glance = map(model, broom::glance),
         tidy = map(model, broom::tidy, conf.int = TRUE))
df_nested  
## # A tibble: 6 x 5
## # Groups:   pubs.vs.data [6]
##   pubs.vs.data data                 model  glance            tidy            
##          <dbl> <list>               <list> <list>            <list>          
## 1          0.4 <tibble [2,000 x 8]> <lm>   <tibble [1 x 12]> <tibble [4 x 7]>
## 2          0.2 <tibble [2,000 x 8]> <lm>   <tibble [1 x 12]> <tibble [4 x 7]>
## 3          0.6 <tibble [2,000 x 8]> <lm>   <tibble [1 x 12]> <tibble [4 x 7]>
## 4          0.8 <tibble [2,000 x 8]> <lm>   <tibble [1 x 12]> <tibble [4 x 7]>
## 5          1   <tibble [2,000 x 8]> <lm>   <tibble [1 x 12]> <tibble [4 x 7]>
## 6          0   <tibble [2,000 x 8]> <lm>   <tibble [1 x 12]> <tibble [4 x 7]>
# Model fit statistics per incentive setting.
df_nested %>% 
  unnest(glance) 
## # A tibble: 6 x 16
## # Groups:   pubs.vs.data [6]
##   pubs.vs.data data  model r.squared adj.r.squared sigma statistic p.value    df
##          <dbl> <lis> <lis>     <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>
## 1          0.4 <tib~ <lm>      0.999         0.999  22.5   601588.       0     3
## 2          0.2 <tib~ <lm>      0.999         0.999  22.5   569099.       0     3
## 3          0.6 <tib~ <lm>      0.999         0.999  22.9   656012.       0     3
## 4          0.8 <tib~ <lm>      0.999         0.999  23.1   647773.       0     3
## 5          1   <tib~ <lm>      0.999         0.999  22.6   690781.       0     3
## 6          0   <tib~ <lm>      0.998         0.998  23.4   368577.       0     3
## # ... with 7 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
## #   deviance <dbl>, df.residual <int>, nobs <int>, tidy <list>
# Prediction grid: grant counts 1-50 crossed with five data-sharing shares
# (row order: all 50 counts at share 0, then share 0.25, and so on).
dummy_data <- tibble(
  total_grants = rep(seq_len(50), times = 5),
  data_grant_share = rep(c(0, 0.25, 0.5, 0.75, 1), each = 50)
)

# Predict over the grid from each scenario's model and compare trajectories.
df_nested %>%
  mutate(new_data = list(dummy_data)) %>%
  mutate(
    prediction = map2(model, new_data,
                      function(mod, nd) broom::augment(mod, newdata = nd))
  ) %>%
  unnest(prediction) %>%
  ggplot(aes(x = total_grants, y = .fitted,
             colour = as.factor(data_grant_share))) +
  geom_line() +
  geom_point() +
  facet_wrap(vars(pubs.vs.data)) +
  labs(
    y = "Predicted number of publications",
    colour = "% of grants mandating data sharing",
    title = "Productivity depending on data sharing incentives"
  ) +
  theme(legend.position = "top")

Incentivising data sharing this way means that groups funded solely by funders mandating open data (OD) fare worse than the others.

A possible explanation for the lower inequality in the system: some actors are disadvantaged, and therefore randomness accounts for a larger share of the outcomes.

Important to note: early on there seems to be an advantage, but it quickly fades and turns into a disadvantage.

An interesting next step could be to give agents strategies (e.g. always share data if the last n grants mandated data sharing — capturing a longer-term effect — versus simply complying with each grant directly), and then see what share of funders mandating data sharing is needed before behaviour changes.

Also, we could include agents that always share data and see how they are affected.

Over time

-> See the exploration script for now.